Single cell RNA-seq of circulating breast cancer cells from 
19 women with ER+/HER2- primary tumours. Analysis of 74 single candidate circultaing tumor cells (CTCs) from two representative ER+/HER2- patients (BRx-42 and BRx-82) and 14 triple negative patients were isolated via micromanipulation and subjected to single-cell RNA-sequencing. The goal is to distinguish groups of HER2- and HER2+ CTCs to look at drug susceptibility and targeted/combination based therapies. The method used for analysis was Seurat, an R toolkit for single cell genomics.

Citations:
Jordan, Nicole Vincent et al. “HER2 expression identifies dynamic functional states within circulating breast cancer cells.” Nature vol. 537,7618 (2016): 102-106. 

Butler, A., Hoffman, P., Smibert, P. et al. "Integrating single-cell transcriptomic data across different conditions, technologies, and species." Nat Biotechnol 36, 411–420 (2018).

Loading dataset

library(Seurat)
library(dplyr)
library(patchwork)
library(Matrix)
library(readr)
library(clusterProfiler)
library(org.Hs.eg.db)
library(plyr)


data <- readMM("/Users/ericabrophy/Documents/BMI511_translational/BreastCancerDataset/E-GEOD-75367.aggregated_filtered_counts.mtx")
data_col <- read_delim("/Users/ericabrophy/Documents/BMI511_translational/BreastCancerDataset/E-GEOD-75367.aggregated_filtered_counts.mtx_cols", 
                       "\t", escape_double = FALSE, col_names = FALSE, 
                       trim_ws = TRUE)
data_row <- read_delim("/Users/ericabrophy/Documents/BMI511_translational/BreastCancerDataset/E-GEOD-75367.aggregated_filtered_counts.mtx_rows", 
                           "\t", escape_double = FALSE, col_names = FALSE, 
                            trim_ws = TRUE)
metadata <- read_delim("/Users/ericabrophy/Documents/BMI511_translational/BreastCancerDataset/E-GEOD-75367.sdrf.txt", 
                       "\t", escape_double = FALSE, trim_ws = TRUE)

Reshaping data

data <- as.matrix(data)
data_row <- data_row[, -1]

rownames(data)<-data_row$X2
colnames(data)<-data_col$X1

data.temp = data
data.temp[data.temp==0] = NA
hist(log10(as.vector(data.temp)+ 1))

Assign actual gene names to IDs

 gene.df <- bitr(data_row$X2, fromType = "ENSEMBL",
                 toType = c("ENSEMBL", "SYMBOL"),
                 OrgDb = org.Hs.eg.db)

Pull out translated gene names

gene_symbol <- gene.df[,2]

Summary of the data

head(gene.df, n=5)
##           ENSEMBL   SYMBOL
## 1 ENSG00000000003   TSPAN6
## 2 ENSG00000000419     DPM1
## 3 ENSG00000000457    SCYL3
## 4 ENSG00000000460 C1orf112
## 5 ENSG00000000938      FGR
summary(gene.df, n=5)
##    ENSEMBL             SYMBOL         
##  Length:17265       Length:17265      
##  Class :character   Class :character  
##  Mode  :character   Mode  :character

Subsetting new data

intersection <- gene.df[,1] %in% rownames(data) 
data[intersection, ]
data_subset <- data[gene.df[,1], ]
data_subset <- as.data.frame(data_subset)

Match gene labels for both dataframes

genomic_idx <- match(rownames(data_subset), gene.df[,1])

Renaming rows of new data matrix to gene names

rownames(data_subset) = make.names(gene.df$SYMBOL, unique=TRUE)

The Seurat object serves as a container that contains both data and analysis

BCdata <- CreateSeuratObject(counts = data_subset, 
                             project = "BreastCancer", 
                             min.cells = 3, 
                             min.features = 200)
FeatureScatter(BCdata,"nCount_RNA","nFeature_RNA")

Check for MT genes -> QC

rownames(data_subset) = make.names(gene.df$SYMBOL, unique=TRUE)
head(grep(pattern = "^MT-", x = rownames(data_subset), value = TRUE))
BCdata[grep(pattern = "^MT-", x = rownames(data_subset))]

QC

BCdata[["percent.mt"]] <- PercentageFeatureSet(BCdata, pattern = "^MT-")
head(BCdata@meta.data, 5) #There are no MT genes in the dataset
##                     orig.ident nCount_RNA nFeature_RNA percent.mt
## BRx_111_1__10_08_15        BRx  2236252.6         5539          0
## BRx_111_1__10_19_15        BRx  2398740.9         5270          0
## BRx_111_2__10_08_15        BRx   828232.8         2165          0
## BRx_111_3__10_08_15        BRx   204546.6         3768          0
## BRx_129_2_111814           BRx   376391.6         5882          0
VlnPlot(BCdata, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

Normalize data

#multiplies this by a scale factor (10,000 by default), and log-transforms the result
BCdata <- NormalizeData(BCdata, normalization.method = "LogNormalize", scale.factor = 10000)
BCdata[["RNA"]]@data
##    [[ suppressing 32 column names 'BRx_111_1__10_08_15', 'BRx_111_1__10_19_15', 'BRx_111_2__10_08_15' ... ]]
##    [[ suppressing 32 column names 'BRx_111_1__10_08_15', 'BRx_111_1__10_19_15', 'BRx_111_2__10_08_15' ... ]]
##    [[ suppressing 32 column names 'BRx_111_1__10_08_15', 'BRx_111_1__10_19_15', 'BRx_111_2__10_08_15' ... ]]
barplot(BCdata@meta.data$nFeature_RNA)

Feature Selection

Plot variable features with and without labels

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(BCdata), 10)
LabelPoints(plot = VariableFeaturePlot(BCdata), points = top10, repel = TRUE)

Scaling the data

#apply a linear transformation (‘scaling’) that is a standard pre-processing 
#step prior to dimensional reduction techniques like PCA. 
all.genes <- rownames(BCdata)
BCdata <- ScaleData(BCdata, features = all.genes)
BCdata[["RNA"]]@scale.data

PCA

BCdata <- RunPCA(BCdata, features = VariableFeatures(object = BCdata), npcs = 30, verbose = FALSE, approx=FALSE)
print(BCdata[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1 
## Positive:  TUBB1, RGS18, H2AC6, CAVIN2, CD226 
## Negative:  GGCT, NPM1, VDAC1, KRT7, UQCC2 
## PC_ 2 
## Positive:  ASRGL1, KYNU, NRK, LOX, AACS 
## Negative:  CRABP2, HOXB2, S100A16, CALML5, FLYWCH2 
## PC_ 3 
## Positive:  GABPB2, SAMHD1, ZNF37BP, KLRD1, SPG7 
## Negative:  PPBP, SCP2, SPARC, HPGD, SNN 
## PC_ 4 
## Positive:  KYNU, KLRD1, NOA1, RPGRIP1, PLAC8 
## Negative:  EME1, IL17RB, FAM72D, WDHD1, DTL 
## PC_ 5 
## Positive:  HBD, SLC4A1, CA1, HBM, AHSP 
## Negative:  TAPBP, NXPH1, FLYWCH2, EPHX2, MSH6
VizDimLoadings(BCdata, dims = 1:2, reduction = "pca")

DimPlot(BCdata, reduction = "pca")

DimHeatmap

#Easy exploration of the primary sources of heterogeneity in a dataset
#and can be useful when trying to decide which PCs to include for further downstream analyses
DimHeatmap(BCdata, dims = 1, cells = 500, balanced = TRUE)

DimHeatmap(BCdata, dims = 1:5, cells = 500, balanced = TRUE)

Jackstraw Plot

#a ranking of principle components based on the percentage of variance explained by each one
BCdata <- JackStraw(BCdata, num.replicate = 100)
BCdata <- ScoreJackStraw(BCdata, dims = 1:20)
JackStrawPlot(BCdata, dims = 1:15)

ElbowPlot(BCdata)

Clustering cells

BCdata <- FindNeighbors(BCdata, dims = 1:10)
BCdata <- FindClusters(BCdata, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 74
## Number of edges: 1366
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.6831
## Number of communities: 2
## Elapsed time: 0 seconds
head(Idents(BCdata), 5)
## BRx_111_1__10_08_15 BRx_111_1__10_19_15 BRx_111_2__10_08_15 BRx_111_3__10_08_15 
##                   0                   0                   0                   0 
##    BRx_129_2_111814 
##                   0 
## Levels: 0 1

UMAP

BCdata <- RunUMAP(BCdata, dims = 1:10)
DimPlot(BCdata, reduction = "umap")

Find all markers for a cluster -> cluster 1

cluster1.markers <- FindMarkers(BCdata, ident.1 = 1, min.pct = 0.25)
head(cluster1.markers, n = 5)
##              p_val avg_logFC pct.1 pct.2    p_val_adj
## GGCT  9.000078e-16 1.6405363 1.000 0.044 1.553864e-11
## MAL2  1.077508e-14 1.4468702 1.000 0.133 1.860318e-10
## SDC1  1.530493e-14 1.5576624 0.931 0.022 2.642396e-10
## UQCC2 3.380903e-14 0.6984841 0.966 0.067 5.837129e-10
## SNRPE 6.288319e-14 0.9916587 1.000 0.156 1.085678e-09

Find markers for every cluster compared to all remaining cells, report only the positive ones

BCdata.markers <- FindAllMarkers(BCdata, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
BCdata.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_logFC)
## # A tibble: 4 x 7
## # Groups:   cluster [2]
##          p_val avg_logFC pct.1 pct.2 p_val_adj cluster gene 
##          <dbl>     <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>
## 1 0.00000159        3.07 0.889 0.759   0.0275  0       TUBB1
## 2 0.000305          3.14 0.911 0.897   1       0       PPBP 
## 3 0.0000000680      2.94 0.759 0.244   0.00117 1       PIP  
## 4 0.000000933       2.68 0.793 0.467   0.0161  1       APOD

The ROC test returns the ‘classification power’ for any individual marker (ranging from 0 - random, to 1 - perfect)

cluster1.markers <- FindMarkers(BCdata, ident.1 = 0, logfc.threshold = 0.25, test.use = "roc", only.pos = TRUE)
head(cluster1.markers, n = 5)
##        myAUC avg_diff power pct.1 pct.2
## TAL1   0.935 1.830775 0.870 0.956 0.517
## LAT    0.917 1.885478 0.834 0.933 0.690
## SLC2A3 0.900 2.677472 0.800 0.933 0.724
## GMPR   0.900 1.330273 0.800 0.956 0.724
## CCL5   0.893 2.306072 0.786 1.000 0.897

Shows expression probability distributions across clusters

VlnPlot(BCdata, features = c("STX12", "HBA1"))

FeaturePlot(BCdata, features = c("STX12", "ATL3", "PTPA", "YIPF5", "MARCHF5", "HBD", "HBA1", "HBA2","HBB", "CA1"))

Generates an expression heatmap for given cells and features

top10 <- BCdata.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
DoHeatmap(BCdata, features = top10$gene) + NoLegend()

Naming clusters

new.cluster.ids <- c("HER2-", "HER2+")
names(new.cluster.ids) <- levels(BCdata)
BCdata <- RenameIdents(BCdata, new.cluster.ids)
plot <- DimPlot(BCdata, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()
HoverLocator(plot = plot, information = FetchData(BCdata, vars = c("ident", "PC_1", "nFeature_RNA")))

TSNE

BCdata <- RunTSNE(object = BCdata, dims = 1:2, resolution = 0.5, perplexity = 10)
plot <- DimPlot(object = BCdata, reduction = "tsne", label = TRUE)
HoverLocator(plot = plot, information = FetchData(BCdata, vars = c("ident", "PC_1", "nFeature_RNA")))

Dotplot of genes identified showing expression levels in the different clusters

features <- c("STX12", "ATL3", "PTPA", "YIPF5", "MARCHF5", "HBD", "HBA1", "HBA2","HBB", "CA1")
features2 <- c("APOD", "PIP", "CRABP2", "XBP1", "HSP90AB1", "F13A1", "TUBB1", "RGS18", "H2AC6", "CAVIN2", "CD226")
DotPlot(BCdata, features = features) + RotatedAxis()

DotPlot(BCdata, features = features2) + RotatedAxis()

Heatmap showing expression levels of genes identified in clusters

DoHeatmap(subset(BCdata, downsample = 100), features = features2, size = 3)

Interactive Plot to show NOTCH1 expression in data

plot <- FeaturePlot(BCdata, features = "NOTCH1")
HoverLocator(plot = plot, information = FetchData(BCdata, vars = c("ident", "PC_1", "nFeature_RNA")))

Shiny App URL

#https://ebrophy.shinyapps.io/SingleCellBreastCancer_Heatmap/